31 데이터 수집&전처리&데이터베이스에 삽입 사이클2 | ✅ 저자: 이유정(박사)

두 번째 크롤링 하기: 카카오맵에서 맛집 후기 수집하기(kakao_blog)

import os

import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

from kakao_blog import scrap_kakao_place_info


# --------------------------------------------------------------
# Database URL: the SQLAlchemy connection URL for the MySQL database.
# It carries the user name, password, host, port, database name and charset.
# When the RESTAURANT_DB_URL environment variable is set it takes precedence,
# so real credentials do not have to be committed with the source; the
# literal below is only the local-development fallback.
DB_URL = os.environ.get(
    'RESTAURANT_DB_URL',
    'mysql+pymysql://root:DjangoUserPass!123@localhost:3306/restaurant_db?charset=utf8mb4',
)
# ---------------------------------------------------------------

def get_engine():
    """
    Build and return a SQLAlchemy engine bound to ``DB_URL``.

    The engine is created with ``echo=False`` so that SQL statements are
    not logged. A new engine object is created on every call.
    """
    engine = create_engine(DB_URL, echo=False)
    return engine

# Column layout shared by every partial frame and by the target table.
_FLAT_COLUMNS = [
    'place_id', 'restaurant_name', 'tag', 'menu', 'review',
    'facility_info', 'price', 'blog_title', 'blog_content', 'published_date',
]


def _tags_frame(info, common):
    """Return one row per tag, laid out in ``_FLAT_COLUMNS`` order."""
    # ``or []`` also covers a key that is present but set to None.
    tags = info.get('tags') or []
    return pd.DataFrame({'tag': list(tags)}).assign(
        menu=None,
        review=None,
        price=None,
        blog_title=None,
        blog_content=None,
        published_date=None,
        **common,
    ).reindex(columns=_FLAT_COLUMNS)


def _menus_frame(info, common):
    """Return one row per menu entry, laid out in ``_FLAT_COLUMNS`` order."""
    menus = pd.DataFrame(info.get('menus') or [])
    if menus.empty:
        return pd.DataFrame(columns=_FLAT_COLUMNS)

    # The scraped 'name' field is stored in the 'menu' column.
    menus = menus.rename(columns={'name': 'menu'})

    if 'price' in menus.columns:
        raw_price = menus['price']
        # Stringify only real values; missing prices stay null instead of
        # turning into the literal strings 'nan' / 'None'.
        price = raw_price.astype(str).where(raw_price.notna())
    else:
        price = None

    # reindex (rather than plain selection) tolerates entries that lack a
    # 'name' field: the 'menu' column is then created as all-null.
    return menus.assign(
        tag=None,
        review=None,
        price=price,
        blog_title=None,
        blog_content=None,
        published_date=None,
        **common,
    ).reindex(columns=_FLAT_COLUMNS)


def _reviews_frame(info, common):
    """Return one row per review, laid out in ``_FLAT_COLUMNS`` order."""
    reviews = pd.DataFrame(
        info.get('review_list') or [],
        columns=['review', 'blog_content', 'published_date'],
    )
    if reviews.empty:
        return pd.DataFrame(columns=_FLAT_COLUMNS)

    return reviews.assign(
        tag=None,
        menu=None,
        price=None,
        # The review text is also stored as the blog title.
        blog_title=reviews['review'],
        **common,
    ).reindex(columns=_FLAT_COLUMNS)


def crawl_and_save_flat(place_ids):
    """
    Crawl Kakao Map place details and append them to the database.

    For every ID in ``place_ids`` the place info is fetched with
    ``scrap_kakao_place_info``; its tags, menus and reviews are flattened
    into rows sharing one column layout, concatenated into a single
    DataFrame and appended to the ``restaurant_details_flat`` table.

    Places for which the scraper returns nothing are skipped with a
    message. When there is nothing to insert (empty ``place_ids`` or no
    rows scraped) the database is not touched.

    Args:
        place_ids (list of str): Kakao Map place IDs to crawl.

    Returns:
        None. The number of inserted rows is printed.
    """
    # Non-empty partial frames collected across all places.
    frames = []

    for pid in place_ids:
        # NOTE(review): assumes scrap_kakao_place_info returns a dict-like
        # object (or something falsy on failure) - confirm against kakao_blog.
        info = scrap_kakao_place_info(pid)
        if not info:
            print(f"Skipping place {pid}: no data returned")
            continue

        facilities = info.get('facilities') or []
        # Values repeated on every row produced for this place.
        common = {
            'place_id': pid,
            'restaurant_name': info.get('name'),
            # Comma-joined facility list, or None when there are none.
            'facility_info': ",".join(map(str, facilities)) if facilities else None,
        }

        for build in (_tags_frame, _menus_frame, _reviews_frame):
            part = build(info, common)
            if not part.empty:
                frames.append(part)

    # pd.concat raises ValueError on an empty list, so stop early.
    if not frames:
        print("Inserted 0 rows into restaurant_details_flat")
        return

    # ------------------------------------------------------------
    # Merge the partial frames and normalise the date column
    # ------------------------------------------------------------
    flat = pd.concat(frames, ignore_index=True)
    # Unparseable values are coerced to NaT, then reduced to plain dates.
    flat['published_date'] = pd.to_datetime(
        flat['published_date'], errors='coerce'
    ).dt.date

    # ------------------------------------------------------------
    # Bulk insert inside a single transaction
    # ------------------------------------------------------------
    with get_engine().begin() as conn:
        flat.to_sql(
            'restaurant_details_flat',  # target table
            conn,
            if_exists='append',         # add to existing rows
            index=False,                # do not store the DataFrame index
            dtype={                     # column types used if the table is created
                'place_id': sqlalchemy.types.VARCHAR(50),
                'restaurant_name': sqlalchemy.types.VARCHAR(100),
                'tag': sqlalchemy.types.VARCHAR(100),
                'menu': sqlalchemy.types.VARCHAR(100),
                'review': sqlalchemy.types.TEXT,
                'facility_info': sqlalchemy.types.TEXT,
                'price': sqlalchemy.types.VARCHAR(50),
                'blog_title': sqlalchemy.types.VARCHAR(255),
                'blog_content': sqlalchemy.types.TEXT,
                'published_date': sqlalchemy.types.DATE,
            },
        )

    # Report how many rows were written.
    print(f"Inserted {len(flat)} rows into restaurant_details_flat")

# ----------------------------------------------------------------
# 스크립트 직접 실행 예시:
# if __name__ == '__main__':
#     PLACE_IDS = ['84268936', '7948829']
#     crawl_and_save_flat(PLACE_IDS)

jupyter 크롤링

# 1) Make the "crawler" directory under the current working directory
#    importable by appending its absolute path to sys.path.
#    NOTE(review): presumably restaurant_details_flat.py and kakao_blog.py
#    live in that directory - confirm against the project layout.
import os
import sys

sys.path.append(
    os.path.abspath(
        os.path.join(os.getcwd(), "crawler")
    )
)

# 2) Import the crawl entry point and the scraper module
from restaurant_details_flat import crawl_and_save_flat
import kakao_blog

# 3) Run the crawl for the sample place IDs
PLACE_IDS = ['84268936', '7948829']
crawl_and_save_flat(PLACE_IDS)

← 이전: 30 데이터 수집&전처리&데이터베이스에 삽입 사이클

다음 →: 01 fast API

💡 AI 인사이트

댓글 커뮤니티

검색

31 데이터 수집&전처리&데이터베이스에 삽입 사이클2 | ✅ 저자: 이유정(박사)

Python 코드 실행기

📝 입력값 (자동 생성됨)

📤 실행 결과:

사이트 및 광고 문의